{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Theano reads THEANO_FLAGS only once, at its first import, and `import keras`\n",
    "# loads its backend (presumably Theano here, judging by the training warnings below),\n",
    "# so the device has to be selected before keras is imported.\n",
    "os.environ['THEANO_FLAGS'] = \"device=cuda\"\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy\n",
    "import sklearn\n",
    "from sklearn import metrics\n",
    "import keras\n",
    "import networkx as nx\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "os.environ['THEANO_FLAGS'] = \"device=cuda\"\n",
    "import theano"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "path_gem = \"./libs/GEM/\"\n",
    "path_data = \"./libs/GEM/gem/data/\"\n",
    "sys.path.append(path_gem)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gem.utils import graph_util, plot_util\n",
    "from gem.evaluation import visualize_embedding as viz\n",
    "from gem.evaluation import evaluate_graph_reconstruction as gr\n",
    "from time import time\n",
    "\n",
    "#from gem.embedding.gf       import GraphFactorization\n",
    "#from gem.embedding.hope     import HOPE\n",
    "#from gem.embedding.lap      import LaplacianEigenmaps\n",
    "from gem.embedding.lle      import LocallyLinearEmbedding\n",
    "from gem.embedding.sdne     import SDNE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Prepare data for embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load all links between articles in uk wiki\n",
    "df_uklinks = pd.read_csv('./data/links_in_uk.csv', encoding = 'UTF-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load id correspondences between uk articles and their equivalents in enwiki\n",
    "uk_en_pairs = pd.read_csv('./data/df_uk_translated.csv.gz', encoding = 'UTF-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load all links in enwiki to red articles\n",
    "red_links_full = pd.read_csv('./data/redlinks_with_en_ids.csv', encoding = 'UTF-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# reidentify 'parent' articles of red links in terms of ukwiki\n",
    "uk_ids_for_rlinksparents = uk_en_pairs.merge(right=red_links_full, how='inner', left_on='id_en', right_on='id_x')\n",
    "uk_ids_for_rlinksparents = uk_ids_for_rlinksparents[['id_uk', 'id_y']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load red links which have at least 5 incoming links from uk wiki\n",
    "rlinks5more = pd.read_csv('./data/red_links_with_at_least_5_distinct_incoming_uk_links.csv', header=None, encoding = 'UTF-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# extract red articles which have at least 5 'parent' articles with correspondences in uk wiki\n",
    "rlinks5more.columns = ['en_id_5more']\n",
    "uk_ids_for_rlinks5more = uk_ids_for_rlinksparents.merge(right=rlinks5more, how='inner', left_on='id_y', right_on='en_id_5more')\n",
    "red_links_final = uk_ids_for_rlinks5more[['id_uk', 'en_id_5more']]\n",
    "red_links_final.columns = ['id', 'link_id']\n",
    "red_links_final.to_csv('./data/redlinks_with_parent_page_uk_id.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.1. In the graph find pieces which contain red links (up to second neighbor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load red articles which have at least 5 'parent' articles with correspondences in uk wiki\n",
    "red_art_with_parent_uk_id = pd.read_csv('./data/redlinks_with_parent_page_uk_id.csv', encoding = 'UTF-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# get all outgoing links of the 'parent' articles of red articles. drop duplicate links.\n",
    "\n",
    "unique_parents_for_redart = red_art_with_parent_uk_id[['id']].drop_duplicates()\n",
    "unique_parents_for_redart.columns = ['rlink_uk_par']\n",
    "\n",
    "found_in_first = df_uklinks.merge(right=unique_parents_for_redart, how='inner', left_on='id', right_on='rlink_uk_par')\n",
    "df_redart_with_parent_outcoming_links = pd.concat((found_in_first[['id','link_id']], red_art_with_parent_uk_id))\n",
    "df_redart_with_parent_outcoming_links = df_redart_with_parent_outcoming_links[['id', 'link_id']].drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of links for the grapf is 348948\n"
     ]
    }
   ],
   "source": [
    "print('number of links for the grapf is', len(df_redart_with_parent_outcoming_links))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# save the data prepared for graph embedding\n",
    "df_redart_with_parent_outcoming_links.to_csv('./data/redart_with_parent_outcoming_links.txt', sep=' ', index=False, header=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Embed graph with GEM library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# File that contains the edges. Format: source target\n",
    "edge_f = os.path.join(path_data, 'redart_with_parent_outcoming_links')\n",
    "# Specify whether the edges are directed\n",
    "isDirected = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load graph\n",
    "G = graph_util.loadGraphFromEdgeListTxt(edge_f, directed=isDirected)\n",
    "G = G.to_directed()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num nodes: 99629, num edges: 348948\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:71: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(50, activation=\"relu\", kernel_regularizer=<keras.reg...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:71: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(15, activation=\"relu\", kernel_regularizer=<keras.reg...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:73: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(3, activation=\"relu\", kernel_regularizer=<keras.reg...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:75: UserWarning: Update your `Model` call to the Keras 2 API: `Model(inputs=/input_7, outputs=Elemwise{m...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:90: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(15, activation=\"relu\", kernel_regularizer=<keras.reg...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:90: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(50, activation=\"relu\", kernel_regularizer=<keras.reg...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:92: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(99629, activation=\"relu\", kernel_regularizer=<keras.reg...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:96: UserWarning: Update your `Model` call to the Keras 2 API: `Model(inputs=/input_8, outputs=Elemwise{m...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne_utils.py:108: UserWarning: Update your `Model` call to the Keras 2 API: `Model(inputs=/input_9, outputs=[Elemwise{...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne.py:116: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n",
      "/home/apps/anaconda3/envs/graphs2/lib/python3.6/site-packages/keras/legacy/layers.py:456: UserWarning: The `Merge` layer is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n",
      "  name=name)\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne.py:119: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne.py:122: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne.py:148: UserWarning: Update your `Model` call to the Keras 2 API: `Model(inputs=/x_in, outputs=[Elemwise{...)`\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne.py:161: UserWarning: The semantics of the Keras 2 argument  `steps_per_epoch` is not the same as the Keras 1 argument `samples_per_epoch`. `steps_per_epoch` is the number of batches to draw from the generator at each epoch. Update your method calls accordingly.\n",
      "/home/Katia/Power2TheWiki/libs/GEM/gem/embedding/sdne.py:161: UserWarning: Update your `fit_generator` call to the Keras 2 API: `fit_generator(generator=<generator..., verbose=1, steps_per_epoch=1373, epochs=1)`\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/1\n",
      "1373/1373 [==============================] - 1998s - loss: 13.0113 - merge_7_loss: 6.4527 - merge_8_loss: 6.4526 - merge_9_loss: 8.0276  \n",
      "sdne:\n",
      "\tTraining time: 2101.314101\n"
     ]
    }
   ],
   "source": [
    "models = []\n",
    "# You can comment out the methods you don't want to run\n",
    "\n",
    "#models.append(LocallyLinearEmbedding(d=2))\n",
    "\n",
    "models.append(SDNE(d=3, beta=5, alpha=1e-5, nu1=1e-6, nu2=1e-6, K=3,n_units=[50, 15,], rho=0.3, n_iter=1, xeta=0.02,n_batch=500,\n",
    "                  modelfile=['./results/intermediate/enc_model.json', './results/intermediate/dec_model.json'],\n",
    "                  weightfile=['./results/intermediate/enc_weights.hdf5', './results/intermediate/dec_weights.hdf5']))\n",
    "\n",
    "for embedding in models:\n",
    "    print ('Num nodes: %d, num edges: %d' % (G.number_of_nodes(), G.number_of_edges()))\n",
    "    t1 = time()\n",
    "    # Learn embedding - accepts a networkx graph or file with edge list\n",
    "    Y, t = embedding.learn_embedding(graph=G, edge_f=None, is_weighted=False, no_python=True)\n",
    "    print (embedding._method_name+':\\n\\tTraining time: %f' % (time() - t1))\n",
    "    # Visualize\n",
    "    #viz.plot_embedding2D(embedding.get_embedding(), di_graph=G, node_colors=None)\n",
    "    #plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Test embedding for finding red articles correspondences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Danger: the data is too big to process in memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# save results of embedding and restart notebook to free memory space\n",
    "np.save('./data/learnt_emb.npy', Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# load embeddings from the path they were saved to before the restart\n",
    "Y = np.load('./data/learnt_emb.npy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# find nearest neighbors with cosine similarity\n",
    "Y_cos_sim = sklearn.metrics.pairwise.cosine_similarity(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Y_cos_sim.shape\n",
    "# save the cosine similarity matrix. Warning: this results in a 39 GB file\n",
    "#np.save('Y_cos_sim.npy', Y_cos_sim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# process the matrix. Warning: this raises MemoryError\n",
    "# a = Y_cos_sim - 2*np.eye(99629)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusion: the experiments show that a new way to process the data has to be found, and that further research is needed on the nature of the data and on which embedding approaches suit it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
